* Start from an empty session: drop the data in memory, then drop stored matrices
clear 
clear matrix
* Do not pause long output with -more- prompts
set more off
* Require variable names to be typed in full (no abbreviations) in the commands below
set varabbrev off

* Move to the author's project folder; -cap- (capture) suppresses the error if this path does not exist,
* in which case the relative paths below resolve against the current working directory
cap cd "/Users/kevincroke/Dropbox/UG Uwezo/Dataverse/"

/*

**********************************************
This section takes raw data from Uwezo website for 2010-2015, 
harmonizes variable names, and appends it into one file.
Then variables used in the analysis are created. 

Stata version 13.1
**********************************************

Note: Insert a command changing directory to a project folder
that contains "raw_data" and "constructed_data" subfolders.

*/   

********
*2010 round: load the raw household file, tag the survey round,
*build the literacy composite, harmonize names, save the append-ready file
********

use "raw_data/UG10_hhld.dta", clear

*survey-round identifier and its indicator (set to 1 in this round's file)
generate rd = 2010
generate rd_2010 = 1

*composite literacy score = english + english1;
*a system-missing english1 is set to 0 only where english is not system-missing
replace english1 = 0 if english1 == . & english != .
generate english_sum = english + english1

*identifiers: household id and district, plus a lower-case copy of district
rename (id_hh id_districtName) (hhid district)
generate district1 = lower(district)

*assets: strip the asset_ prefix first, then spell out the electricity name
rename asset_* *
rename elec electricity

*schooling and household variables renamed to the names shared across rounds
rename (grade hh_gender mothers_edu mothers_age) (class hh_sex mother_edu mother_age)

*school-type dummies are left missing when schooltype is missing
generate public = (schooltype == 1) if schooltype < .
generate private = (schooltype == 2) if schooltype < .

*copies of the enrolment variables under the common names
generate inschool = enr_ans
generate neverattended = neverenrolled

save "constructed_data/UG10_append_hh.dta", replace

*********
*2011 round: load the raw household file, tag the survey round,
*build the literacy composite, harmonize names, save the append-ready file
*********

use "raw_data/UG11_hhld.dta", clear

*convert parishcode from string to numeric in place
destring parishcode, replace

*survey-round identifier and its indicator (set to 1 in this round's file)
generate rd = 2011
generate rd_2011 = 1

*composite literacy score = english + english1
generate english_sum = english + english1

*identifiers: household id and district, plus a lower-case copy of district
rename (id_hh id_districtName) (hhid district)
generate district1 = lower(district)

*assets: strip the asset_ prefix first, then spell out the electricity name
rename asset_* *
rename elec electricity

*schooling and household variables renamed to the names shared across rounds
rename (grade hh_gender mothers_edu mothers_age) (class hh_sex mother_edu mother_age)

*school-type dummies are left missing when schooltype is missing
generate public = (schooltype == 1) if schooltype < .
generate private = (schooltype == 2) if schooltype < .

*copies of the enrolment variables under the common names
generate inschool = enr_ans
generate neverattended = neverenrolled

save "constructed_data/UG11_append_hh.dta", replace

**********
*2012 round: load the raw household file, tag the survey round,
*build the literacy composite, harmonize names, save the append-ready file
**********

use "raw_data/UG12_hhld.dta", clear

*survey-round identifier and its indicator (set to 1 in this round's file)
generate rd = 2012
generate rd_2012 = 1

*composite literacy score = english + english1
generate english_sum = english + english1

*identifiers: household id, parish name and district, plus a lower-case copy of district
rename (id_hh parish id_districtName) (hhid parishname district)
generate district1 = lower(district)

*assets: strip the asset_ prefix first, then spell out the electricity name
rename asset_* *
rename elec electricity

*display summary statistics for the asset variables in this round
summarize electricity water radio tv phone motorbike bicycle

*schooling and household variables renamed to the names shared across rounds
rename (grade hh_gender mothers_edu mothers_age) (class hh_sex mother_edu mother_age)

*school-type dummies are left missing when schooltype is missing
generate public = (schooltype == 1) if schooltype < .
generate private = (schooltype == 2) if schooltype < .

*copies of the enrolment variables under the common names
generate inschool = enr_ans
generate neverattended = neverenrolled

save "constructed_data/UG12_append_hh.dta", replace

**********
*prep 2013 data for appending
**********

use "raw_data/UG13_hhld.dta", clear 
sort eacode

/*
Merge using the link file provided via email by Uwezo Uganda staff
so that 2013 data will have a parish name and code.

keep(master match) retains every household record (matched or not) and
discards link-file villages that have no household record. Without it,
such villages would stay in the data as rows with no household
information, receive rd=2013 below, and be carried into the appended file.
nogenerate replaces the separate "drop _merge" step.
*/
 
merge m:1 eacode using "raw_data/VillageID2013_for_merge.dta", keep(master match) nogenerate

*survey-round identifier and its indicator (set to 1 in this round's file)
gen rd=2013
gen rd_2013=1

*create composite literacy score: english1 values of 2 and system-missing are set to 0 first
replace english1=0 if english1==2
replace english1=0 if english1==.
gen english_sum=english+english1

*make variables consistent across survey rounds
rename id_hh hhid
rename id_districtName district
gen district1=lower(district)

*strip the asset_ prefix first, then spell out the electricity name
rename asset_* *
rename elec electricity
rename grade class
*school-type dummies are left missing when schooltype is missing
gen public= (schooltype==1) if !missing(schooltype)
gen private= (schooltype==2) if !missing(schooltype)
gen inschool=enr_ans
gen neverattended=neverenrolled 
rename hh_gender hh_sex
rename mothers_edu mother_edu
rename mothers_age mother_age
drop home_language // need to drop, otherwise prevents appending

sort id_village

save "constructed_data/UG13_append_hh.dta", replace

**********
*prep 2014 data for appending
**********

use "raw_data/UG14_hhld.dta", clear
sort id_village
 
/*
Merge using the 2014 link file from Uwezo Uganda.

keep(master match) retains every household record (matched or not) and
discards link-file villages that have no household record. Without it,
such villages would stay in the data as rows with no household
information, receive rd=2014 below, and be carried into the appended file.
nogenerate replaces the separate "drop _merge" step.
*/
merge m:1 id_village using "raw_data/VillageID2014_for_merge.dta", keep(master match) nogenerate

*survey-round identifier and its indicator (set to 1 in this round's file)
gen rd=2014
gen rd_2014=1

*create composite literacy score: english1 values of 2 and system-missing are set to 0 first
replace english1=0 if english1 ==2
replace english1=0 if english1==.
gen english_sum=english+english1

*make variables consistent across survey rounds
rename id_hh hhid
rename id_districtName district
gen district1=lower(district)

*strip the asset_ prefix first, then spell out the electricity name
rename asset_* *
rename elec electricity
rename grade class
*school-type dummies are left missing when schooltype is missing
gen public= (schooltype==1) if !missing(schooltype)
gen private= (schooltype==2) if !missing(schooltype)
gen inschool=enr_ans
gen neverattended=neverenrolled 
rename hh_gender hh_sex
rename mothers_edu mother_edu
*NOTE(review): unlike the 2010-2013 sections, mothers_age is not renamed to mother_age here - confirm whether the 2014 file contains it
drop home_language // need to drop, otherwise prevents appending

save "constructed_data/UG14_append_hh.dta", replace

**********
*2015 round: load the raw household file, tag the survey round,
*build the literacy composite, harmonize names, save the append-ready file
**********

use "raw_data/UG15_hhld.dta", clear

*survey-round identifier and its indicator (set to 1 in this round's file)
generate rd = 2015
generate rd_2015 = 1

*composite literacy variable = english + english1;
*english1 is first set to 0 where it equals 2, or where it is system-missing and age>=7
*(in Stata a missing age also satisfies age>=7)
replace english1 = 0 if english1 == 2 | (english1 == . & age >= 7)
generate english_sum = english + english1

*identifiers: household id, district and parish name, plus a lower-case copy of district
rename (id_hh id_districtName) (hhid district)
generate district1 = lower(district)
rename parish_name parishname
drop sub_location

*assets: strip the asset_ prefix first, then spell out the electricity name
rename asset_* *
rename elec electricity

*schooling and household variables renamed to the names shared across rounds
*NOTE(review): unlike the 2010-2013 sections, mothers_age is not renamed to mother_age here - confirm whether the 2015 file contains it
rename (grade hh_gender mothers_edu) (class hh_sex mother_edu)

*school-type dummies are left missing when schooltype is missing
generate public = (schooltype == 1) if schooltype < .
generate private = (schooltype == 2) if schooltype < .

*copies of the enrolment variables under the common names
generate inschool = enr_ans
generate neverattended = neverenrolled

*home_language has to go, otherwise it prevents appending
drop home_language

save "constructed_data/UG15_append_hh.dta", replace

*stack the six append-ready rounds: start from 2010, then add 2011-2015 in order
use "constructed_data/UG10_append_hh.dta", clear

foreach yy of numlist 11/15 {
    append using "constructed_data/UG`yy'_append_hh.dta"
}

save "constructed_data/UG10_11_12_13_14_15_hh.dta", replace

*****************
*create variables for analysis
*****************

*after appending, each round dummy is 1 in its own round and missing elsewhere;
*recode the missing values to 0
summarize rd_*
forvalues y = 2010/2015 {
    replace rd_`y' = 0 if rd_`y' == .
}

*respondent's age in 2000, 2001, 2002 and 2003:
*program year minus (survey year minus age at survey), filled in round by round
forvalues k = 0/3 {
    generate age_0`k' = .
    forvalues y = 2010/2015 {
        replace age_0`k' = 200`k' - (`y' - age) if rd_`y' == 1
    }
}

*survey round dummy variables d_rd1, d_rd2, ...
tabulate rd, generate(d_rd)

*one indicator per program year: 1 if the respondent was aged 1 to 7 in that year
*(a missing age gives 0, as in a comparison chain where missing fails the <=7 test)
forvalues k = 0/3 {
    generate treat0`k' = inrange(age_0`k', 1, 7)
    label variable treat0`k' "respondent eligible for treatment in 200`k'"
}

*total years of treatment = number of program years with the indicator equal to 1
egen years_treated = rowtotal(treat00 treat01 treat02 treat03)

 *treatment indicator: 1 when parishname exactly equals one of the listed treatment parishes, 0 otherwise
local treat_parishes ///
    NAJENITI KADOKOLENE TEKWANA KAKUTU BULANGIRA ///
    BUKIBOLOGOTO BUMASIKYE BUNAKHAYOTI BUNAMUBI BUMATANDA ///
    NSINSE KISOWEZI WAIRAMA KIGULAMO NAWANGAIZA ///
    SONE WANGALE NAMWANGA KIREWA MULANGI KAYORO NYALAKOT ///
    MAWERO HASYULE JINJA

generate treat = 0
foreach pname of local treat_parishes {
    replace treat = 1 if parishname == "`pname'"
}

*control indicator: same construction for the listed control parishes
local control_parishes ///
    GOLIGOLI KAKORO KAGUMU KASODO KITAIKAWONANI ///
    GIBUZALE BUNANIMI BULOBI BUFUKHULA BUNABWANA ///
    BUBAGO BUWALIRA BUWAISWA MAKANDWA BUGONGO ///
    MIFUMI SENDA BUSABA NABIYOGA KAPISA MORUKATIPE OSUKURU ///
    BUTEBA LUMINO MAJANJI

generate control = 0
foreach pname of local control_parishes {
    replace control = 1 if parishname == "`pname'"
}

*check that all treat and control parishes are in the correct region
*(author's note: 32 obs not in Eastern region of Uganda)
tabulate id_regionName if treat == 1 | control == 1
tabulate id_regionName if parishname == "BUGONGO"
*author's note: there is a "Bugongi" parish in Mitooma district, see http://www.lcmt.org/uganda/mitooma/mitooma
tabulate district if id_regionName == "Western"
*un-flag the BUGONGO observations recorded in the Western region
replace control = 0 if parishname == "BUGONGO" & id_regionName == "Western"

*numeric index with one value per distinct parishname
egen parishcode1 = group(parishname)

*study district dummy: first correct an inconsistent spelling of bududa
replace district1 = "bududa" if district1 == "buduuda"

*1 for the twelve study districts, 0 for other districts, missing when district1 is empty
*(two inlist() calls because string inlist() accepts at most 10 arguments)
generate study_district = ///
    ( inlist(district1, "pallisa", "tororo", "busia", "iganga", "mbale", "bududa") | ///
      inlist(district1, "manafwa", "sironko", "mayuge", "butaleja", "kibuku", "namutumba") ) ///
    if district1 != ""

*study parish = flagged as either treatment or control
generate study_parish = (treat == 1 | control == 1)

*****************
*create main outcome variables
****************

*combined test score
generate total_score = math + english

*flags for system-missing outcomes, set to 1 only within study parishes
generate missing_math = (math == .) & study_parish == 1
generate missing_english = (english == .) & study_parish == 1

*either outcome missing; left missing (not 0) outside study parishes
generate missing_obs = (math == . | english == .) if study_parish == 1

*study cohort: 1 for anyone aged 1-7 in at least one of the years 2000-2003, 0 otherwise
generate age1_7 = inrange(age_00, 1, 7) | inrange(age_01, 1, 7) | ///
    inrange(age_02, 1, 7) | inrange(age_03, 1, 7)

*************************
*create control variables
*************************

*female household head: 1 when hh_sex equals 2, 0 otherwise (a missing hh_sex gives 0)
*NOTE(review): missing hh_sex and gender are coded 0 rather than missing - confirm this is intended
generate female_head = (hh_sex == 2)
generate female = (gender == 2)

*one dummy per observed value of age
tabulate age, generate(d_age)

rename fathers_edu father_edu

*mother has no education: 1 if mother_edu is 0, 0 if it is 1, 2 or 3, missing for any other value
generate mother_no_edu = (mother_edu == 0) if inlist(mother_edu, 0, 1, 2, 3)

*mother has post-primary education: 1 if mother_edu is 2 or 3, missing when mother_edu is system-missing
generate mother_postprimary = inlist(mother_edu, 2, 3) if mother_edu != .

*district dummies, defined over the districts that occur in study parishes
tabulate district1 if study_parish == 1, generate(district_dum)

*restrict the data to study parishes
keep if study_parish == 1

*create wealth index
*author's note: bicycle and motorbike are omitted from the PCA because they are absent in 2010,
*and water because it is inconsistently defined across rounds
tabstat radio phone tv electricity water bicycle motorbike, by(rd)

*first principal component of the four remaining assets, stored as the wealth index
quietly pca radio phone tv electricity
quietly predict wealthindex_2015
label variable wealthindex_2015 "Wealth index"

*quintiles of the index
sort wealthindex_2015
xtile quintile = wealthindex_2015, nq(5)

*one dummy per quintile, lowest (1) to highest (5);
*each is 0 when quintile is missing
local q = 0
foreach grp in poorest poor average rich richest {
    local ++q
    generate `grp'_2015 = (quintile == `q')
}

*low assets = bottom two quintiles
generate bottom_2_2015 = (quintile < 3)
label variable bottom_2_2015 "low assets"

*list of the quintile dummies, stored in a global macro
global wealth="poorest_2015 poor_2015  average_2015 rich_2015 richest_2015"
label variable private "private school"

*standardize test scores
egen st_total = std(total_score)
egen st_math = std(math)
egen st_english = std(english)


******
/*interaction variables for the heterogeneity analysis
1. gender
2. poverty/low assets
3. years of program eligibility 
4. primary school age (under 14) at time of survey
5. intra-household spillovers
6. age at program eligibility
*/
******

*1. gender: treatment x female
generate treat_female = treat * female
label variable treat_female "treat x female"

*2. poverty / low assets: treatment x bottom two wealth quintiles
generate treat_poor = treat * bottom_2_2015
label variable treat_poor "treat x low assets"

*3. years of program eligibility: treatment x (2, 3 or 4 years treated)
generate expose2_4 = inlist(years_treated, 2, 3, 4)
generate treat_expose2_4 = treat * expose2_4
label variable treat_expose2_4 "treat x 2+ years exposure"

*4. primary school age: treatment x (age below 14 at time of survey)
generate under14 = (age < 14)
generate treat_under14 = treat * under14
label variable treat_under14 "treat x under 14"

*complement of under14
generate over14 = 1 - under14

*5. Intra-household spillovers interaction 

*an individual belongs to the treatment cohort when age1_7 equals 1
generate treat_cohort = (age1_7 == 1)

/*
hhid may not be unique across survey rounds, so the number of cohort
members per household is counted one round at a time: each
num_treated_YYYY is filled only for observations of round YYYY.
*/
forvalues y = 2010/2015 {
    bysort hhid: egen num_treated_`y' = total(treat_cohort) if rd == `y'
}

*collapse the six round-specific counts into a single variable
generate num_treated = .
forvalues y = 2010/2015 {
    replace num_treated = num_treated_`y' if rd == `y'
}

*"others treated" = household count of treatment-eligible members minus 1, or 0 when the count is 0
*NOTE(review): 1 is subtracted for every member of a household with num_treated>0,
*including members whose own treat_cohort is 0 - confirm this is intended
generate others_treated = cond(num_treated == 0, 0, num_treated - 1) if num_treated >= 0

generate spillover_treat = treat * others_treated
label variable spillover_treat "treat x others treated"

*6. age at first exposure interaction 

*start_0k marks the first program year (2000-2003) in which the child was eligible:
*eligible in 200k and not flagged as started in any earlier year.
*The macro accumulates the "not started earlier" conditions as the loop advances.
local not_earlier
forvalues k = 0/3 {
    generate start_0`k' = (treat0`k' == 1 `not_earlier')
    local not_earlier `not_earlier' & start_0`k' == 0
}

*age in the year eligibility started; missing for children never eligible
generate age_at_start = .
forvalues k = 0/3 {
    replace age_at_start = age_0`k' if start_0`k' == 1
}

*treatment x (eligibility started at age 1)
generate start1 = (age_at_start == 1)
generate age1_treat = treat * start1
label variable age1_treat "treat x program eligible at age 1"


save "constructed_data/for_analysis.dta", replace
